In [1]:
def describe(table,include='all',showNullIndexes=False):
import numpy as np
print table.describe(include=include)
print "\n-> Has Nil? (How many?)"
has_nil = table.isnull().apply(np.sum)
print has_nil
if showNullIndexes:
for c in has_nil.index:
if not has_nil[c]: continue
print "\n-> Indexes where column '{}' is null:".format(c)
print table[table[c].isnull()].index.values
In [2]:
import matplotlib
class PlotDistro:
@staticmethod
def boxhist(data,column,**kw):
from matplotlib import pyplot as plt
fig,axs = plt.subplots(1,2)
fig.set_size_inches(15,5)
data[column].plot.box(ax=axs[0])
bins = kw['bins'] if kw.has_key('bins') else 10
log = kw['log'] if kw.has_key('log') else False
data[column].plot.hist(ax=axs[1],bins=bins,log=log)
return plt
In [3]:
import numpy
def histogram2stepfunction(hist,bins):
hist = hist.tolist()
hh = hist+hist
hh[::2] = hist
hh[1::2] = hist
hist = hh[:]
bins = bins.tolist()
bb = bins[:-1]+bins[1:]
bb.sort()
bins = bb[:]
assert len(hist)==len(bins)
return hist,bins
class Distro:
@staticmethod
def binning(vector,nbins,xmin=None,xmax=None,spacing='linear'):
"""
"""
import numpy as np
spacing_function = {'linear' : np.linspace}
if spacing is not 'linear':
spacing = 'linear'
xmin = vector.min() if xmin is None else xmin
xmax = vector.max() if xmax is None else xmax
nbins = 10 if not (nbins >= 1) else nbins
bins = spacing_function[spacing](xmin,xmax,nbins)
return bins
@staticmethod
def histogram(vector,bins):
"""
"""
import numpy as np
h,b = np.histogram(vector,bins=bins,normed=False)
assert np.array_equal(b,bins)
return h,b
In [4]:
class Colors:
# From https://en.wikipedia.org/wiki/Web_colors
rgb = {
'blueish' : ['#0000FF',
'#9933FF',
'#0066FF'],
'greenish' : ['#00FF00',
'#66CC00',
'#009900'],
'redish' : ['#FF0000',
'#993300',
'#FF6600'],
'yellowish' : ['#FFFF00',
'#CCCC00',
'#999900']
}
mono = {
'grayish' : ['#000000'
'#808080'
'#FFFFFF']
}
#blue = blueish[0]
#red = redish[0]
#green = greenish[0]
#yellow = yellowish[0]
@classmethod
def get_colors(this,N,mode='sparse'):
"""
mode = sparse
"""
mode = mode if mode is 'sparse' else 'sparse'
from collections import OrderedDict
if mode is 'sparse':
groups = this.rgb.keys()
groups.sort()
color_groups = OrderedDict()
for g in groups:
color_groups[g] = this.rgb.get(g)[:]
#TODO: make this selection better, probably objectfying each color group
colors = []
while len(colors)<N:
for _g in color_groups.keys():
try:
colors.append(color_groups[_g].pop(0))
except IndexError,e:
raise IndexError,"You're asking me {0} colors, more than I have to offer.".format(N)
if len(colors) == N:
break
return colors
In [5]:
import bokeh
import numpy
class PlotHisto:
@staticmethod
def init_figure(tools = None,logscale=False):
"""
"""
from bokeh.plotting import figure
from bokeh.models.tools import PanTool,\
BoxZoomTool,\
WheelZoomTool,\
ResizeTool,\
ResetTool,\
HelpTool
from bokeh.models.tools import Tool
TOOLS = [PanTool(),BoxZoomTool(),WheelZoomTool(),ResizeTool(),ResetTool(),HelpTool()]
if tools:
if not isinstance(tools,(list,tuple)):
tools = [tools]
for t in tools:
if not issubclass(t.__class__,Tool):
continue
TOOLS.append(t)
#TOOLS = 'pan,box_zoom,wheel_zoom,crosshair,hover,resize,reset'
y_type = 'log' if logscale else 'linear'
fig = figure(tools=TOOLS,y_axis_type=y_type)
return fig
@staticmethod
def bar(counts,bins,label,color,alpha=0.5,figure=None):
"""
"""
bottoms = [0]*len(counts)
figure = PlotHisto.bar_stacked(tops=counts,
bottoms=bottoms,
bins=bins,
label=label,
color=color,
alpha=alpha,
figure=figure)
return figure
@staticmethod
def bar_stacked(tops,bottoms,bins,label,color,alpha=None,figure=None):
"""
"""
figure = PlotHisto._bar(tops,bottoms,bins[:-1],bins[1:],
label,color,alpha=alpha,figure=figure)
return figure
@staticmethod
def bar_adjacent(counts,lefts,rights,label,color,alpha=None,figure=None):
"""
"""
bottoms = [0]*len(counts)
figure = PlotHisto._bar(counts,bottoms,lefts,rights,
label,color,alpha=alpha,figure=figure)
return figure
@staticmethod
def _bar(tops,bottoms,lefts,rights,label,color,alpha,figure):
"""
"""
assert len(tops)==len(bottoms) or isinstance(bottoms,(int,float))
assert len(lefts)==len(rights)
if figure is None:
figure = init_figure()
figure.quad(top=tops,
bottom=bottoms,
left=lefts,
right=rights,
fill_color=color,fill_alpha=alpha,
legend=label)
return figure
@staticmethod
def step(counts,bins,label,color,figure):
"""
"""
_y,_x = histogram2stepfunction(counts,bins)
figure.line(x=_x,
y=_y,
#line_color="#D95B43",line_width=2,
line_color=color,line_width=2,
legend='label')
import numpy as np
_x = np.diff(bins)/2+bins[:-1]
figure.circle(x=_x,
y=_y,
#line_color="#D95B43",line_width=2,
line_color=color,line_width=2,
fill_color="white",fill_alpha=1,
size=9,
legend='label')
return figure
@staticmethod
def multihist(df,column,groupby,nbins=10,mode='over',logscale=False):
"""
mode = [over,stacked,adjacent]
"""
from bokeh.models import CrosshairTool,HoverTool
import numpy as np
fig = PlotHisto.init_figure(tools=[CrosshairTool(),HoverTool()],logscale=logscale)
label = column
fig.select(CrosshairTool).dimensions = ['height']
fig.select(HoverTool).mode = 'hline'
label_tip = "{}: ".format(label)
fig.select(HoverTool).tooltips = [(label_tip,"$x")]
fig.xgrid.grid_line_color = None #vgrid
fig.ygrid.minor_grid_line_color = 'gray' #hgrid
fig.ygrid.minor_grid_line_alpha = 0.5
fig.ygrid.minor_grid_line_dash = 'dashed'
fig.yaxis.axis_label = 'Counts'
fig.xaxis.axis_label = label
bins = Distro.binning(df[column],nbins,spacing='linear')
groups = df.groupby(groupby)
ngroups = len(groups)
colors = Colors.get_colors(ngroups)
counts_last = [0]*(len(bins)-1)
for i,grp_key in enumerate(groups.indices.keys()):
_data = df.loc[groups.groups[grp_key],(column)]
_counts,_bins = Distro.histogram(_data,bins)
if mode is 'stacked':
_counts = [ _counts[_i]+_bot for _i,_bot in enumerate(counts_last) ]
fig = PlotHisto.bar_stacked(_counts,counts_last,_bins,str(i),colors[i],figure=fig)
elif mode is 'adjacent':
bins_left,bins_right = [],[]
for _i in range(len(bins)-1):
_binsplit = np.linspace(bins[_i],bins[_i+1],ngroups+1)
bins_left.append(_binsplit[i])
bins_right.append(_binsplit[i+1])
fig = PlotHisto.bar_adjacent(_counts,bins_left,bins_right,str(i),colors[i],figure=fig)
else:
fig = PlotHisto.bar(_counts,bins,str(i),colors[i],alpha=float(1)/ngroups,figure=fig)
last_counts = _counts[:]
#else:
# _data = df.loc[grp.groups[grp_key],(column)]
# _counts,_bins = Distro.histogram(_data,bins)
# fig = PlotHisto.step(_counts,bins,i,fig)
return fig
In [6]:
class PlotBox:
@staticmethod
def boxplot(df,column,by,mean=True):
"""
"""
from bokeh.plotting import figure
from bokeh.models import Range1d
if mean:
df['mean'] = df.groupby(by)[column].transform(lambda x:x-x.mean())
groups = df.groupby(by)
if mean:
groups = groups['mean']
else:
groups = groups[column]
# Now a 'try' statement because I don't want to get in to the details when I'm not using Pandas.Categories.
try:
# When the 'by' argument is a DF.Categorical instance, we have to do some transforming.
# Notice that this Categorical (group) labels are strings containing the range of each bin,
# so that we get from them the "left,right" values for the 'bins' to use.
import re
_bins = set([ re.sub(r'[^\d.]+','',s) for c in df[by].values.categories for s in c.split(',') ])
_bins = list(_bins)
_bins.sort()
_bins = np.asarray(_bins,dtype=np.float)
except:
# Now, when I am not using Categories. In particular, now I'm passing "by" as a vector/Series
# containing numerical labels
assert False
_diff = np.diff(_bins)
_center = _bins[:-1] + _diff/2
# Generate some synthetic time series for six different categories
cats = [ s for s,g in groups ]
# Find the quartiles and IQR foor each category
q1 = groups.quantile(q=0.25)
q2 = groups.quantile(q=0.5)
q3 = groups.quantile(q=0.75)
iqr = q3 - q1
upper = q3 + 1.5*iqr
lower = q1 - 1.5*iqr
# find the outliers for each category
def outliers(group):
cat = group.name
return group[(group > upper.loc[cat][0]) | (group < lower.loc[cat][0])]
out = groups.apply(outliers).dropna()
# Prepare outlier data for plotting, we need coordinate for every outlier.
outx = []
outy = []
for i,cat in enumerate(cats):
# only add outliers if they exist
if not out.loc[cat].empty:
for value in out[cat]:
outx.append(_center[i])
outy.append(value)
p = figure(title="")
from bokeh.models import FixedTicker
p.x_range = Range1d(_bins.min(),_bins.max())
p.xaxis.ticker = FixedTicker(ticks=_center)
# If no outliers, shrink lengths of stems to be no longer than the minimums or maximums
qmin = groups.quantile(q=0.00)
qmax = groups.quantile(q=1.00)
upper = [ min([x,y]) for (x,y) in zip(qmax,upper) ]
lower = [ max([x,y]) for (x,y) in zip(qmin,lower) ]
# stems
p.segment(_center, upper, _center, q3, line_width=2, line_color="black")
p.segment(_center, lower, _center, q1, line_width=2, line_color="black")
# boxes
p.rect(_center, (q3+q2)/2, _diff/2, q3-q2, fill_color="#E08E79", line_width=2, line_color="black")
p.rect(_center, (q2+q1)/2, _diff/2, q2-q1, fill_color="#3B8686", line_width=2, line_color="black")
# whiskers (almost-0 height rects simpler than segments)
p.rect(_center, lower, _diff/4, 0.002, line_color="black")
p.rect(_center, upper, _diff/4, 0.002, line_color="black")
# outliers
p.circle(outx, outy, size=6, color="#F38630", fill_alpha=0.6)
p.xgrid.grid_line_color = None
p.ygrid.minor_grid_line_color = 'gray'
p.ygrid.minor_grid_line_alpha = 0.5
p.ygrid.minor_grid_line_dash = 'dashed'
p.xaxis.major_label_text_font_size="12pt"
p.xaxis.major_label_orientation = -3.14/2
p.xaxis.axis_label = by
p.yaxis.axis_label = column if not mean else column + ' (0-mean)'
return p
The catalog we're going to play here is the "best.observations.idz" spectroscopic catalogue.
Before proceeding, let me quote the website's ASCII catalogues section: "These ASCII catalogues are provided for the convenience of users of the 2dFGRS Final Data Release. However it is important to note that the mSQL database is the definitive data source, and contains all the information in the ASCII catalogues and much additional data."
That said, we can go to the data...ok, not so soon. We have to deal with the (ascii) catalogue and its columns format first. The best.observation.idz
columns description summary is the following:
#name type description
serial I6 Database serial number (=SEQNUM)
spectra I1 Number of spectra obtained
name A10 2dFGRS name (=NAME)
UKST A3 UKST plate (=IFIELD)
ra A11 R.A. (B1950)
dec A11 Dec. (B1950)
ra2000 A11 R.A. (J2000)
dec2000 A11 Dec. (J2000)
BJG F6.3 Final bj magnitude without extinction correction
BJSEL F6.3 Final bj magnitude with extinction correction
BJG_OLD F6.3 Original bj magnitude without extinction correction
BJSELOLD F6.3 Original bj magnitude with extinction correction
GALEXT F5.3 Galactic extinction value
SB_BJ F6.3 SuperCosmos bj magnitude without extinction correction
SR_R F6.3 SuperCosmos R magnitude without extinction correction
z F9.6 Best redshift (observed)
z_helio F9.6 Best redshift (heliocentric)
obsrun A5 Observation run of best spectrum
quality I1 Redshift quality parameter for best spectrum (reliable redshifts have quality>=3)
abemma I1 Redshift type (abs=1, emi=2, man=3)
Z_ABS F9.6 Cross-correlation redshift from best spectrum
KBESTR I1 Cross-correlation template from best spectrum
R_CRCOR F5.3 Cross-correlation R value from best spectrum
Z_EMI F9.6 Emission redshift from best spectrum
NMBEST I2 Number of emission lines for Z_EMI from best spectrum
SNR F6.2 Median S/N per pixel from best spectrum
ETA_TYPE F10.6 Eta spectral type parameter from best spectrum (-99.9 if none)
The catalogue (data file) itself has no header (i.e, column names).
So, after adding the column names header line, fixing the positional (ra,dec) columns (before: hh mm ss, after: hh:mm:ss) and subsituting the whitespaces to semicolons we have the new version of the catalogue, named best.observations.idz.csv
, which the first 5 lines -- 2 header, 3 data -- are the following:
#I6;I1;A10;A3;A11;A11;A11;A11;F6.3;F6.3;F6.3;F6.3;F5.3;F6.3;F6.3;F9.6;F9.6;A5;I1;I1;F9.6;I1;F5.3;F9.6;I2;F6.2;F10.6
serial;spectra;name;UKST;ra;dec;ra2000;dec2000;BJG;BJSEL;BJG_OLD;BJSELOLD;GALEXT;SB_BJ;SR_R;z;z_helio;obsrun;quality;abemma;Z_ABS;KBESTR;R_CRCOR;Z_EMI;NMBEST;SNR;ETA_TYPE
1;2;TGS436Z001;349;00:11:55.72;-32:32:55.2;00:14:27.05;-32:16:14.6;19.424;19.362;19.430;19.390;0.062;19.368;18.286;0.2981;0.2981;01SEP;4;1;0.2981;5;4.5700;0.2984;1;3.8;-99.90000
2;1;TGS496Z001;349;00:11:59.29;-33:14:41.3;00:14:30.55;-32:58:00.7;18.842;18.789;18.870;18.840;0.053;18.688;17.291;0.1229;0.1228;01OCT;5;1;0.1229;1;14.3800;-9.9990;0;47.6;-2.58920
3;1;TGS435Z001;349;00:11:49.37;-32:39:57.4;00:14:20.71;-32:23:16.8;18.320;18.265;18.350;18.310;0.055;18.336;17.138;0.1038;0.1038;01SEP;4;1;0.1038;1;9.3800;0.1032;1;28.4;-2.46500
This is the file we are going to deal now.
In [7]:
# The reading of this catalogue could be done either using Pandas or Astropy.
# To have some homogeneity (looking into the future) let's do it with Astropy.
import astropy
from astropy.io import ascii
tab = ascii.read('best.observations.idz.csv',
format='csv', delimiter=';',
header_start=1)
In [8]:
# Summary
tab.info
Out[8]:
Since the info provided regards the columns and data types, let's see what Pandas gives to us.
In [9]:
import pandas
df = tab.to_pandas()
In [10]:
pandas.set_option('display.max_columns',25)
pandas.set_option('display.width',1000)
df.describe()
Out[10]:
In [11]:
df.describe(include=['O'])
Out[11]:
In [12]:
class Coords:
@staticmethod
def sex2deg(ra,dec,frame='icrs'):
from astropy.coordinates import SkyCoord
from astropy import units
c = SkyCoord(ra=ra, dec=dec,
unit=(units.hourangle,units.deg),
frame=frame)
return c.ra.value,c.dec.value
In [13]:
# Since I am not interested on B1950 coordinates, I will just substitute those values with the J2000 in degrees
# First, though, let me copy the table
data = df.copy()
In [14]:
#data.ra,data.dec = Coords.sex2deg(data.ra2000,data.dec2000)
# And let me update the original(df) table with these values, just for the records ;)
#df.ra,df.dec = data.ra,data.dec
In [15]:
data = df[['z','quality','abemma']]
data.describe()
Out[15]:
In [16]:
%matplotlib inline
In [17]:
p = PlotDistro.boxhist(data,'z')
With the description summary and the plots above we see there are some "Null" values (i.e, "-9") in our data we were not aware of before (at least, I wasn't). Let's remove them by giving "None" to them.
In [18]:
_znil = data.z==-9
data.ix[_znil,'z'] = None
In [19]:
data.describe()
Out[19]:
In [20]:
p = PlotDistro.boxhist(data,'z',bins=20,log=True)
In [21]:
p = data.groupby('quality').z.plot.hist(bins=20,stacked=True)
In [22]:
from bokeh.io import output_notebook, show, output_file
output_notebook()
In [23]:
pq = PlotHisto.multihist(data,column='z',groupby='quality',nbins=30,mode='adjacent',logscale=True)
show(pq)
Out[23]:
In [24]:
pa = PlotHisto.multihist(data,column='z',groupby='abemma',nbins=30,mode='adjacent',logscale=True)
show(pa)
Out[24]:
The best catalog provides bj and R magnitudes, from the SuperCosmos[?]. There are also some other columns expressing different "versions" of magnitude bj:
BJG : Final bj magnitude without extinction correction
BJSEL : Final bj magnitude with extinction correction
BJG_OLD : Original bj magnitude without extinction correction
BJSELOLD : Original bj magnitude with extinction correction
GALEXT : Galactic extinction value
SB_BJ : SuperCosmos bj magnitude without extinction correction
SR_R : SuperCosmos R magnitude without extinction correction
For the time being -- since I don't really what are all does mags -- let me focus on SB_BJ
and SR_R
.
In [25]:
data['SB_BJ'] = df['SB_BJ']
data['SR_R'] = df['SR_R']
data.describe()
Out[25]:
In [26]:
_nil = data.z==-9
print('Number of z == -9 entries: {}'.format(_nil.sum()))
#print 'Indexes where it is "-9":',np.where(_nil)
data.z[_nil] = None
In [27]:
_nil = data.SB_BJ==0
print('Number of SB_BJ == 0 entries: {}'.format(_nil.sum()))
#print 'Indexes where it is "-9":',np.where(_nil)
data.SB_BJ[_nil] = None
In [28]:
_nil = data.SR_R==0
print('Number of SR_R == 0 entries: {}'.format(_nil.sum()))
#print 'Indexes where it is "-9":',np.where(_nil)
data.SR_R[_nil] = None
In [29]:
describe(data,showNullIndexes=False)
In [30]:
nbins=20
In [31]:
plt = PlotDistro.boxhist(data,column='SB_BJ',bins=nbins)
In [32]:
plt = PlotDistro.boxhist(data,column='SR_R',bins=20)
In [33]:
pbj = PlotHisto.multihist(data,column='SB_BJ',groupby='quality',mode='adjacent')
show(pbj)
Out[33]:
In [34]:
pr = PlotHisto.multihist(data,column='SR_R',groupby='quality',mode='adjacent')
show(pr)
Out[34]:
In [35]:
import pandas as pd
import numpy as np
mag_min = min(data['SR_R'].min(),data['SB_BJ'].min())
mag_max = max(data['SR_R'].max(),data['SB_BJ'].max())
mag_bins = np.linspace(mag_min,mag_max,nbins)
In [36]:
data['bins_SR'] = pd.cut(data['SR_R'],mag_bins)
p_boxplot_SR = PlotBox.boxplot(data,column='SR_R',by='bins_SR')
In [37]:
data['bins_SB'] = pd.cut(data['SB_BJ'],mag_bins)
p_boxplot_SB = PlotBox.boxplot(data,column='SB_BJ',by='bins_SB')
In [38]:
def hist_compare(df,columns,bins,logscale=False):
from bokeh.plotting import figure
from bokeh.models import CrosshairTool,HoverTool
TOOLS = 'pan,box_zoom,wheel_zoom,crosshair,hover,resize,reset'
if logscale:
p_hists = figure(tools=TOOLS,y_axis_type='log')
else:
p_hists = figure(tools=TOOLS)
p_hists.xgrid.grid_line_color = None
p_hists.ygrid.minor_grid_line_color = 'gray'
p_hists.ygrid.minor_grid_line_alpha = 0.5
p_hists.ygrid.minor_grid_line_dash = 'dashed'
p_hists.select(CrosshairTool).dimensions = ['height']
p_hists.select(HoverTool).mode = 'hline'
p_hists.select(HoverTool).tooltips = [("mag: ","$x")]
x_label = columns[0]
y_label = columns[1]
x = df[x_label]
y = df[y_label]
hs,b = np.histogram(x,bins=bins,normed=False)
p_hists.quad(top=hs,
bottom=0,
left=bins[:-1],
right=bins[1:],
fill_color="#036564",fill_alpha=0.5,
legend=x_label)
hp,b = np.histogram(y,bins=bins,normed=False)
hh,bb = histogram2stepfunction(hp,b)
p_hists.line(x=bb,
y=hh,
line_color="#D95B43",line_width=2,
legend=y_label)
_b = np.diff(bins)/2+bins[:-1]
p_hists.circle(x=_b,
y=hp,
size=9,line_color="#D95B43",line_width=2,
fill_color="white",fill_alpha=1,
legend=y_label)
p_hists.yaxis.axis_label = 'Counts'
return p_hists
In [39]:
p_hists = hist_compare(data,['SB_BJ','SR_R'],mag_bins,logscale=False)
In [40]:
p_hists.plot_height = 400
p_boxplot_SR.tools = p_hists.tools
p_boxplot_SR.x_range = p_hists.x_range
p_boxplot_SR.plot_height = p_hists.plot_height/2
p_boxplot_SR.xaxis.axis_label = None
p_boxplot_SR.ygrid.minor_grid_line_color = None
p_boxplot_SB.tools = p_hists.tools
p_boxplot_SB.x_range = p_hists.x_range
p_boxplot_SB.plot_height = p_hists.plot_height/2
p_boxplot_SB.xaxis.axis_label = None
p_boxplot_SB.ygrid.minor_grid_line_color = None
from bokeh.plotting import gridplot
p = gridplot([[p_hists],[p_boxplot_SB],[p_boxplot_SR]])
show(p)
Out[40]:
In [41]:
from bokeh import mpl
data.boxplot(column='SB_BJ',by='quality')
show(mpl.to_bokeh())
Out[41]:
In [42]:
from bokeh import mpl
data.boxplot(column='SR_R',by='quality')
show(mpl.to_bokeh())
Out[42]:
In [43]:
import plotly
print plotly.__version__
plotly.offline.init_notebook_mode()
In [44]:
import plotly.offline as plo
class Scatter3D:
@staticmethod
def plotly(df,columns):
from plotly.graph_objs import Scatter3d,Data
xl = columns[0]
yl = columns[1]
zl = columns[2]
pts = Scatter3d(x=data[xl], y=data[yl], z=data[zl],
mode='markers',marker=dict(size=10,symbol='circle',opacity=0.5))
p = Data([pts])
return p
@staticmethod
def matplotlib(df,columns):
import matplotlib
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111,projection='3d')
xl = columns[0] ; xs = df[xl]
yl = columns[1] ; ys = df[yl]
zl = columns[2] ; zs = df[zl]
ax.scatter(xs,ys,zs, marker='o')
ax.set_xlabel(xl)
ax.set_ylabel(yl)
ax.set_zlabel(zl)
return plt
In [45]:
#plo.plot(scatter3d(data,['z','SB_BJ','SR_R']))
In [46]:
# Let's sample data then...
data_sample = data.sample(frac=0.1)[['z','SB_BJ','SR_R']]
describe(data_sample)
In [47]:
#plo.iplot(Scatter3D.plotly(data_sample,['z','SB_BJ','SR_R']))
In [48]:
#%matplotlib notebook
%matplotlib inline
plt = Scatter3D.matplotlib(data,['z','SB_BJ','SR_R'])
In [49]:
class PlotScatter:
@staticmethod
def scatter_color(df,x_column,y_column,color_column,color_logscale=False):
from bokeh.plotting import figure
from bokeh.models import Range1d
title = "{0} .vs. {1}".format(x_column,y_column)
TOOLS="pan,wheel_zoom,box_select,lasso_select,box_zoom,reset"
# create the scatter plot
p = figure(title = title,
min_border = 10,
min_border_left = 50,
tools=TOOLS)
p.xaxis.axis_label = x_column
xmin = df[x_column].min()
xmax = df[x_column].max()
p.x_range = Range1d(start = xmin,
end = xmax)
p.yaxis.axis_label = y_column
ymin = df[y_column].min()
ymax = df[y_column].max()
p.y_range = Range1d(start = ymin,
end = ymax)
def scalar2color(vector,logscale=False):
import numpy as np
if logscale:
vector = np.log(1+vector)
vmin = vector.min()
vmax = vector.max()
vfac = 255.0/(vmax-vmin)
colors = []
for v in vector:
if np.isnan(v):
clr = '#000000'
else:
r = int((v-vmin)*vfac)
g = int(255-((v-vmin)*vfac))
b = 0
clr = "#%02x%02x%02x" % (r,g,b)
colors.append(clr)
return colors
xs = df[x_column]
ys = df[y_column]
colors = scalar2color(df[color_column],color_logscale)
p.scatter(xs, ys, size=5, fill_color=colors, fill_alpha=0.5, line_color=None)
return p
In [50]:
#data_sample = data_sample.sort_values(by=['z'],ascending=True)
p = PlotScatter.scatter_color(data_sample,'SB_BJ','SR_R','z')
show(p)
Out[50]:
One thing that should be done is to define ways of spliting -- or binning -- a data vector.
For example, last plot is hard to visualize the z
(i.e, colors) coordinate -- because it's a dense plot, I have thought about level curves, which should clarify the visualization. To do so, we have to define what are the levels we want to define in our data (here, z
).
I have to move on on regression now, but here some links about density plots and contours. I read those links because I want to create a "SB_BJ .vs. SR_R" scatter plot smoothed on "z" values, as if I had an surface image, where the height of this surface-image would be proportional of the values of 'z' there. (see the "contours notebook") for motivation/memory.
In [51]:
data.columns
Out[51]:
In [52]:
data = data[[u'z', u'quality', u'abemma', u'SB_BJ', u'SR_R']]
In [53]:
describe(data)
I'll use scikit-learn regression algorithms to try it out. Those are documented on the 1st chapter of the documentation.
I will start with some famous in our community, for simplicity:
In [54]:
data = data.dropna()
describe(data)
In [55]:
train = data.sample(frac=0.8)
describe(train)
In [56]:
test = data.drop(train.index)
describe(test)
In [57]:
kn = 100
In [58]:
from sklearn.neighbors import KNeighborsRegressor as KNR
knn = KNR(kn,weights='uniform')
X = train[['SB_BJ','SR_R']]
y = train.z
train_fit = knn.fit(X,y)
In [59]:
X_test = test[['SB_BJ','SR_R']]
y_predict = train_fit.predict(X_test)
In [60]:
def scatter_kde(x,y,x_label,y_label,lim_scatter=None):
"""
"""
from bokeh.plotting import figure,gridplot
from bokeh.models import Range1d
title = "{} .vs. {}".format(x_label,y_label)
TOOLS="pan,wheel_zoom,box_select,lasso_select,box_zoom,reset"
x_min,x_max = x.min(),x.max()
y_min,y_max = y.min(),y.max()
if lim_scatter is None:
_min = min(x_min,y_min)
_max = max(x_max,y_max)
lim_scatter = [_min,_max]
# create the scatter plot
p_scatter = figure(title = title,
min_border = 10,
min_border_left = 50,
tools=TOOLS)
p_scatter.x_range = Range1d(start = lim_scatter[0], end = lim_scatter[1])
p_scatter.xaxis.axis_label = x_label
p_scatter.y_range = Range1d(start = lim_scatter[0], end = lim_scatter[1])
p_scatter.yaxis.axis_label = y_label
p_scatter.line(lim_scatter, lim_scatter, color='black', line_width=2)
p_scatter.scatter(x, y, size=3, color="#3A5785", alpha=0.5)
# Kernel Density Estimates
#
def kde(values,vmin=None,vmax=None):
import numpy as np
from scipy.stats import gaussian_kde
kern = gaussian_kde(values)
vmin = vmin if vmin is not None else values.min()
vmax = vmax if vmax is not None else values.max()
_cov = kern.covariance[0][0]
pnts = np.arange(vmin, vmax, 10*_cov)
kde = kern(pnts)
return kde,pnts
# Create the HORIZONTAL plot (former histogram)
#
#LINE_ARGS = dict(color="#3A5785", line_color=None)
#
x_kde,x_grid = kde(x,x_min,x_max)
hmax = x_kde.max()*1.1
p_kde_spec = figure(title = None,
plot_width = p_scatter.plot_width,
plot_height = p_scatter.plot_height/3,
x_range = p_scatter.x_range,
y_range = (0, hmax),
min_border = 10,
min_border_left = 50,
toolbar_location = None,
tools=TOOLS)
p_kde_spec.xgrid.grid_line_color = None
#
p_kde_spec.line(x_grid, x_kde)#,**LINE_ARGS)
# Create the VERTICAL plot (former histogram)
#
th = 42 # need to adjust for toolbar height, unfortunately
#
y_kde,y_grid = kde(y,y_min,y_max)
vmax = y_kde.max()*1.1
p_kde_photo = figure(title = None,
plot_width = p_scatter.plot_width/3,
plot_height = p_scatter.plot_height+th-10,
x_range = (0, vmax),
y_range = p_scatter.y_range,
min_border = 10,
min_border_top = th,
toolbar_location = None,
tools=TOOLS)
p_kde_photo.ygrid.grid_line_color = None
#
p_kde_photo.line(y_kde, y_grid)
# Let's adjust the borders
#
p_kde_photo.min_border_top = 80
p_kde_photo.min_border_left = 0
p_kde_photo.min_border_bottom = 50
p_kde_spec.min_border_top = 10
p_kde_spec.min_border_right = 10
p_kde_spec.min_border_left = 80
p_scatter.min_border_right = 10
p_scatter.min_border_left = 80
p_scatter.min_border_bottom = 50
# Arrange them (the plots) to a regular grid
#
layout = gridplot([[p_scatter,p_kde_photo],[p_kde_spec,None]])
return layout
In [61]:
from bokeh.plotting import show
p = scatter_kde(test.z,y_predict,'z-real','z-estimated')
show(p)
Out[61]:
In [62]:
from sklearn.neighbors import KNeighborsRegressor as KNR
knn = KNR(kn,weights='distance')
X = train[['SB_BJ','SR_R']]
y = train.z
train_fit = knn.fit(X,y)
X_test = test[['SB_BJ','SR_R']]
y_predict = train_fit.predict(X_test)
In [63]:
from bokeh.plotting import show
p = scatter_kde(test.z,y_predict,'z-real','z-estimated')
show(p)
Out[63]:
I just realized the data has some bad quality redshift, quality <= 2
. We should clean it also.
In [65]:
rows_good = data.quality >= 3
good = data.loc[rows_good]
describe(good)
In [66]:
train = good.sample(frac=0.8)
test = good.drop(train.index)
In [67]:
from sklearn.neighbors import KNeighborsRegressor as KNR
knn = KNR(kn,weights='distance')
X = train[['SB_BJ','SR_R']]
y = train.z
train_fit = knn.fit(X,y)
X_test = test[['SB_BJ','SR_R']]
y_predict = train_fit.predict(X_test)
from bokeh.plotting import show
p = scatter_kde(test.z,y_predict,'z-real','z-estimated')
show(p)
Out[67]:
In [70]:
def scatter_hists(x,y,x_label,y_label,lim_scatter=None):
"""
"""
from bokeh.plotting import figure,gridplot
from bokeh.models import Range1d,BoxSelectTool,LassoSelectTool
import numpy as np
title = "{} .vs. {}".format(x_label,y_label)
TOOLS="pan,wheel_zoom,box_select,lasso_select,box_zoom,reset"
x_min,x_max = x.min(),x.max()
y_min,y_max = y.min(),y.max()
if lim_scatter is None:
_min = min(x_min,y_min)
_max = max(x_max,y_max)
lim_scatter = [_min,_max]
x_bins = np.linspace(_min,_max,100)
y_bins = x_bins
# create the scatter plot
p_hist2d = figure(title=title,
x_range = lim_scatter,
y_range = lim_scatter,
plot_width=600,
plot_height=600,
min_border=10,
min_border_left=50,
tools=TOOLS)
p_hist2d.select(BoxSelectTool).select_every_mousemove = False
p_hist2d.select(LassoSelectTool).select_every_mousemove = False
p_hist2d.xaxis.axis_label = x_label
p_hist2d.yaxis.axis_label = y_label
p_hist2d.line(lim_scatter, lim_scatter, color='light_gray', line_width=1)
# We will not plot the usual colorful/heatmap-like histogram, but a size-scaled one
# So the next steps are to compute the histogram-2d itself, then clean it (no zero-counte)
# and (re)define the (x,y) grid to plot the points (scaled by the histogram bins counts).
#
hist2d, x_edges, y_edges = np.histogram2d(y, x, bins=(x_bins,y_bins))
assert np.array_equal(x_edges,x_bins)
assert np.array_equal(y_edges,y_bins)
# remove null bins
hist2d_y_ind, hist2d_x_ind = np.where(hist2d>0)
y_edges = y_edges[hist2d_y_ind]
x_edges = x_edges[hist2d_x_ind]
hist2d = hist2d[hist2d_y_ind, hist2d_x_ind]
# give them a size; no science here, just the old "looks-good"...
_hmin = hist2d.min()
_hmax = hist2d.max()
hist2d_scale = 2 + 10 * np.log2( (hist2d-_hmin) / (_hmax-_hmin) +1 )
# a DataFrame makes life easier when using Bokeh
#
_df = pd.DataFrame({'spec' :x_edges,
'photo':y_edges,
'scale':hist2d_scale})
r_hist2d = p_hist2d.square(x = _df['spec'],
y = _df['photo'],
size = _df['scale'],
color = "#3A5785",
alpha = 0.5)
#s = p.scatter(x=xs, y=xp, size=1, color="black")
# create the horizontal histogram
x_hist, hedges = np.histogram(x, bins=x_bins,
range=lim_scatter)
hzeros = np.zeros(len(x_hist)-1)
hmax = max(x_hist)*1.1
LINE_ARGS = dict(color="#3A5785", line_color=None)
p_hist_spec = figure(toolbar_location=None,
plot_width=p_hist2d.plot_width,
plot_height=200,
x_range=p_hist2d.x_range,
y_range=(-hmax, hmax),
title=None,
min_border=10,
min_border_left=50,
tools=TOOLS)
p_hist_spec.xgrid.grid_line_color = None
p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=x_hist, color="white", line_color="#3A5785")
hh1 = p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.5, **LINE_ARGS)
hh2 = p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.1, **LINE_ARGS)
# create the vertical histogram
y_hist, vedges = np.histogram(y, bins=y_bins,
range=lim_scatter)
vzeros = np.zeros(len(y_hist)-1)
vmax = max(y_hist)*1.1
th = 42 # need to adjust for toolbar height, unfortunately
p_hist_photo = figure(toolbar_location=None,
plot_width=200,
plot_height=p_hist2d.plot_height+th-10,
x_range=(-vmax, vmax),
y_range=p_hist2d.y_range,
title=None,
min_border=10,
min_border_top=th,
tools=TOOLS)
p_hist_photo.ygrid.grid_line_color = None
p_hist_photo.xaxis.major_label_orientation = -3.14/2
p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=y_hist, color="white", line_color="#3A5785")
vh1 = p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.5, **LINE_ARGS)
vh2 = p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.1, **LINE_ARGS)
p_hist_photo.min_border_top = 80
p_hist_photo.min_border_left = 0
p_hist_spec.min_border_top = 10
p_hist_spec.min_border_right = 10
p_hist2d.min_border_right = 10
layout = gridplot([[p_hist2d,p_hist_photo],
[p_hist_spec,None]])
def update(attr, old, new):
inds = np.array(new['1d']['indices'])
if len(inds) == 0 or len(inds) == len(x_edges):
hhist1, hhist2 = hzeros, hzeros
vhist1, vhist2 = vzeros, vzeros
else:
neg_inds = np.ones_like(x, dtype=np.bool)
neg_inds[inds] = False
hhist1, _ = np.histogram(x[inds_x], bins=hedges)
vhist1, _ = np.histogram(y[inds_xp], bins=vedges)
hhist2, _ = np.histogram(x[neg_inds_x], bins=hedges)
vhist2, _ = np.histogram(y[neg_inds_y], bins=vedges)
hh1.data_source.data["top"] = hhist1
hh2.data_source.data["top"] = -hhist2
vh1.data_source.data["right"] = vhist1
vh2.data_source.data["right"] = -vhist2
r_hist2d.data_source.on_change('selected', update)
return layout
In [71]:
from bokeh.plotting import show
p = scatter_hists(test.z,y_predict,'z-real','z-estimated')
show(p)
Out[71]:
In [ ]: